{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征工程 取对数log1p(对线性模型很重要，单调变换树模型影响不大)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(r\"C:\\Users\\yun\\Desktop\\pima-indians-diabetes.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分开特征和标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train=train['Target']\n",
    "X_train=train.drop('Target',axis=1)\n",
    "column_org=X_train.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. feat 编码：log(x+1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_log</th>\n",
       "      <th>Plasma_glucose_concentration_log</th>\n",
       "      <th>blood_pressure_log</th>\n",
       "      <th>Triceps_skin_fold_thickness_log</th>\n",
       "      <th>serum_insulin_log</th>\n",
       "      <th>BMI_log</th>\n",
       "      <th>Diabetes_pedigree_function_log</th>\n",
       "      <th>Age_log</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.945910</td>\n",
       "      <td>5.003946</td>\n",
       "      <td>4.290459</td>\n",
       "      <td>3.583519</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.543854</td>\n",
       "      <td>0.486738</td>\n",
       "      <td>3.931826</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.693147</td>\n",
       "      <td>4.454347</td>\n",
       "      <td>4.204693</td>\n",
       "      <td>3.401197</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.317816</td>\n",
       "      <td>0.300845</td>\n",
       "      <td>3.465736</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2.197225</td>\n",
       "      <td>5.214936</td>\n",
       "      <td>4.174387</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.190476</td>\n",
       "      <td>0.514021</td>\n",
       "      <td>3.496508</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.693147</td>\n",
       "      <td>4.499810</td>\n",
       "      <td>4.204693</td>\n",
       "      <td>3.178054</td>\n",
       "      <td>4.553877</td>\n",
       "      <td>3.370738</td>\n",
       "      <td>0.154436</td>\n",
       "      <td>3.091042</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.927254</td>\n",
       "      <td>3.713572</td>\n",
       "      <td>3.583519</td>\n",
       "      <td>5.129899</td>\n",
       "      <td>3.786460</td>\n",
       "      <td>1.190279</td>\n",
       "      <td>3.526361</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_log  Plasma_glucose_concentration_log  blood_pressure_log  \\\n",
       "0       1.945910                          5.003946            4.290459   \n",
       "1       0.693147                          4.454347            4.204693   \n",
       "2       2.197225                          5.214936            4.174387   \n",
       "3       0.693147                          4.499810            4.204693   \n",
       "4       0.000000                          4.927254            3.713572   \n",
       "\n",
       "   Triceps_skin_fold_thickness_log  serum_insulin_log   BMI_log  \\\n",
       "0                         3.583519           0.000000  3.543854   \n",
       "1                         3.401197           0.000000  3.317816   \n",
       "2                         0.000000           0.000000  3.190476   \n",
       "3                         3.178054           4.553877  3.370738   \n",
       "4                         3.583519           5.129899  3.786460   \n",
       "\n",
       "   Diabetes_pedigree_function_log   Age_log  Target  \n",
       "0                        0.486738  3.931826       1  \n",
       "1                        0.300845  3.465736       0  \n",
       "2                        0.514021  3.496508       1  \n",
       "3                        0.154436  3.091042       0  \n",
       "4                        1.190279  3.526361       1  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_log=np.log1p(X_train)\n",
    "feat_names=column_org+'_log'\n",
    "X_train_log=pd.DataFrame(columns=feat_names,data=X_train_log.values)\n",
    "train = pd.concat([X_train_log, y_train], axis = 1)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv('Log1p_pima-indians-diabetes.csv',index = False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
